library(tidyverse)  # for data manipulation and plotting
library(rvest)      # for webscraping

# ------------------------------------------------------------------------------
# set up for plotting
# ------------------------------------------------------------------------------

theme_set(theme_minimal())

# colour palette
highlight_colour <-  "#35469D"

For publishing on github: https://liuyanguu.github.io/post/2021/01/06/rmarkdown-to-github-pages/

intro

Tennis result data

Read the data from github

key points - 2 tours in separate repos - For each tour each years match results are in a separate csv - objective: create a dataframe of results for from alls years for each tour

#' Get WTA or ATP tour match results from the Tennis Abstract github for the specified year (https://github.com/JeffSackmann)
#'
#' @param year an integer. The earliest years where there are data: wta 1920; atp 1968.
#' @param tour a string either 'wta' or 'atp'
#'
#' @return a dataframe of match results 

get_res <- function(year, tour){

  # form file location string based on year and tour
  repo_url <- glue::glue("https://raw.githubusercontent.com/JeffSackmann/tennis_{tour}/master/")
  file_name <- glue::glue("{tour}_matches_{year}.csv")
  file_location <- str_c(repo_url, file_name)
  #browser()

  # read in data from csv on github
  res_year <- read_csv(file_location,
                      
                      # address issue with column type being read differently
                      # in different files
                      col_types = list(winner_seed = col_double(),
                                         loser_seed = col_double(),
                                       draw_size = col_double())) %>% 
    
    # make variable names consistent
    janitor::clean_names() %>%

    # add identifiers
    mutate(tour = tour, 
           year = year)

  return(res_year)
}

# ------------------------------------------------------------------------------
# download results
# ------------------------------------------------------------------------------

start_year <- 1968
end_year <-  2022

# aggregate atp results into a single dataframe
atp_results <- seq(start_year, end_year) %>% 
  map_df(~get_res(., "atp"))

# aggregate wta results into a single dataframe
wta_results <- seq(start_year, end_year) %>% 
  map_df(~get_res(., "wta"))

Tidy the data

Key points: - observations are matches - the players identified as values of the following variables winner_id, winner_name, loser_id, loser_name. - Single variables split across two columns (e.g. winner_age and loser_age actual a single variable age) winner_ioc and loser_ioc - Move to observations being a single player’s match results

#' Get observations relating to either match winners or losers from 
#' tennis abstract match raw result data
#'
#' @param raw_results_df a dataframe of tennis abstract match result raw data
#' @param result specifies whether `winner` or `loser`data should be returned
#'
#' @return a dataframe of match data for either winners or losers

get_data_by_result <- function(raw_results_df, result){
  
  res_to_retain <- result
  
  if(res_to_retain == "winner"){
    res_to_retain_suffix <- "w_"
    res_to_drop <- "loser"
    res_to_drop_suffix <- "l_"
  }
  
  else if (res_to_retain == "loser"){
    res_to_retain_suffix <- "l_"
    res_to_drop <- "winner"
    res_to_drop_suffix <- "w_"
  }
  
  # tidy up data for match losers
  raw_results_df %>%

    # drop columns containing match stats of the winner
    select(-c(starts_with(res_to_drop), starts_with(res_to_drop_suffix))) %>%

    # simplify column naming to enable row binding of winners and losers
    rename_with(.fn = ~str_replace( .x, str_c(res_to_retain, "_"), ""),
                .cols = starts_with(res_to_retain)) %>%
    rename_with(.fn = ~str_replace( .x, res_to_retain_suffix, ""),
                .cols = starts_with(res_to_retain_suffix)) %>%

    # add result column and match_id
    mutate(result = res_to_retain,
           match_id = 1:n(),
           match_id_str = str_c(match_id, tour, year, sep = "_"),
           .after = name)
  
}


#' Puts tennis abstract match raw result data in a tidy form
#'
#' @param raw_results_df a dataframe of tennis abstract match result raw data
#'
#' @return a tidy dataframe of tennis abstract match results data

tidy_match_results <- function(raw_results_df){
  
  losers <- get_data_by_result(raw_results_df, "loser")
  winners <- get_data_by_result(raw_results_df, "winner")
  
  results_tidy <- 
    
    # recombine data on winning and losing players
    bind_rows(losers, winners) %>% 
    
    # order for readability
    arrange(year, tour, match_id) %>% 
    
    # convert variable types where needed
    mutate(tourney_date = lubridate::ymd(tourney_date))
  
  return(results_tidy)
}

# ------------------------------------------------------------------------------
# tidy results
# ------------------------------------------------------------------------------

atp_results_tidy <- tidy_match_results(atp_results)
wta_results_tidy <- tidy_match_results(wta_results)

# create single dataframe with results from both tours
all_results_tidy <- bind_rows(
  atp_results_tidy, wta_results_tidy
)

Focus on relevant data

results_focused <- all_results_tidy %>% 
  
  # focus on entrants to grand slam tournaments
  filter(tourney_level == "G") %>% 
  
  # focus on variables of interest
  select(year, tourney_name, tour, name, id, 
         country_code = ioc) %>% 
  
  # remove duplicates where players have appeared in multiple rounds 
  unique()

results_focused

Check completeness of data

  • I had assumed there had always been 128 plays in each singles draw (so a total of 256 players at each Grandslam)
  • Looking smaller draw size before 1989. Checking historical records of tournament draw I could see that some did have mens or womens draws with less than 128 entrants.
  • Rather than manually check all thesizes of all the smaller draws are correct, I decided to refine the scope of the analysis to look at Grand Slam tournaments from 1990 onwards (where there is no missing data)
  • also identified and issue with tournament names being different for the US Open Men’s and Women’s Singles in 2019. String matching issue
# look for missing data in post 1968 results
results_focused %>% 
  count(year, tourney_name) %>% 
  filter(n != 256)
# focus on Grand Slams from 1990 onwards
results_clean <- results_focused %>% 
  filter(year >= 1990) %>% 
  mutate(tourney_name = replace(tourney_name, 
                                tourney_name == "Us Open",
                                "US Open"))

# confirm there is no missing data for Grand Slams 1990 onwards
num_missing <- results_clean %>% 
  count(year, tourney_name) %>% 
  filter(n != 256) %>% 
  nrow()

assertthat::assert_that(num_missing == 0)
## [1] TRUE

GDP data

Read in, tidy and focus data

Source: Our World in Data

gdp_focus <- read_csv("data/gdp-per-capita-worldbank.csv") %>% 
  
  # make variable names consistent
  janitor::clean_names() %>% 
  
  # rename variables to make them easier to work with 
  rename(gdp_per_capita = gdp_per_capita_ppp_constant_2017_international,
         country = entity,
         country_code = code) %>% 
  
  # focus on 1990 onwards in line with Grand Slam entrant data
  filter(year >= 1990)

# remove entities which are not countries (e.g. regions or continents)
# i.e. those which do not have country codes
non_country_entities <- gdp_focus %>% 
  filter(is.na(country_code)) %>% 
  distinct(country) %>% 
  .$country

gdp_focus <- gdp_focus %>% 
  filter(!country %in% non_country_entities) %>% 
  filter(country != "World")

Check completeness of data

look for explicitly missing data and implicitly missing data

# -----------------------------------------------------------------------------
# explore explicitly missing data
# -----------------------------------------------------------------------------
visdat::vis_miss(gdp_focus) +
  labs(title = "No explicitly missing data")

# -----------------------------------------------------------------------------
# explore implicitly missing data
# -----------------------------------------------------------------------------

# create a dataframe with all combinations of year and country
gdp_focus_exp_grid <-  expand_grid(country = unique(gdp_focus$country),
            year = unique(gdp_focus$year)) %>%
  
  # add in gdp data where available
  left_join(gdp_focus)

# show implicitly missing data
gdp_missing <- gdp_focus_exp_grid %>% 
  filter(is.na(gdp_per_capita))
gdp_missing
visdat::vis_miss(gdp_focus_exp_grid) +
  labs(title = "But there is implicitly missing data")

### exploring and imputing missing data

  • where data is missing, in most cases it is missing for a number of years starting at 1990 and then followed by complete data up until 2020. Based on visual inspection of the dataframe and plot (few lines with breaks in the middle) below.
# look at countries with missing data
gdp_missing %>% 
  count(country)
countries_gdp_missing <- unique(gdp_missing$country)

# look at where the missing data is in each country's time series
gdp_missing %>% 
  arrange(country, year)
p <- gdp_focus_exp_grid %>% 
  
  # only need to plot countries with missing data
  filter(country %in% countries_gdp_missing) %>% 
  
  # create the plot
  ggplot(aes(year, gdp_per_capita, group = country)) +
    geom_line(alpha = 0.8, colour = highlight_colour)

# make interactive with tooltips for data exploration
plotly::ggplotly(p)

Given the pattern of the missing data get strange behavior trying to use an interpolation function

library(zoo)

strange_interpolation <- c("Kosovo", "Cayman Islands", "Iceland")

p <- gdp_focus_exp_grid %>% 
  mutate(
    gdp_per_capita = zoo::na.approx(gdp_per_capita),
    highlight = country %in% strange_interpolation) %>% 
  # only need to plot countries with missing data
  filter(country %in% countries_gdp_missing) %>% 
  
  # create the plot
  ggplot(aes(year, gdp_per_capita, 
             group = country, 
             colour = highlight)) +
    geom_line(alpha = 0.8) +
  scale_colour_manual(values = c("#C7C2C4", highlight_colour)) +
  theme(legend.position = "none")

# make interactive with tooltips for data exploration
plotly::ggplotly(p)

So use a simple method, based on the pattern of missing data

# -----------------------------------------------------------------------------
# find the earliest record of gdp_per_capita for each country
# -----------------------------------------------------------------------------

earliest_gdp <- gdp_focus_exp_grid %>% 
  
  # focus on relevant observations
  filter(!is.na(gdp_per_capita),
         country %in% countries_gdp_missing) %>% 
  
  # identify earliest year with gdp_per_cap observation
  group_by(country) %>% 
  mutate(min_year = min(year)) %>% 
  ungroup() %>% 
  
  # focus only on each countries earliest year with gdp_per_cap observation
  filter(year == min_year) %>% 
  
  # retain only the columns needed to simplify the join below
  select(earliest_gdp_per_capita = gdp_per_capita,
         country_code,
         country)

# -----------------------------------------------------------------------------
# for each country replace missing gdp_per_capita data with earliest 
# gdp_per_cap observation
# -----------------------------------------------------------------------------

gdp_clean <- gdp_focus_exp_grid %>% 
  
  left_join(earliest_gdp, by = c("country" = "country")) %>% 
  
  # replace nas
  mutate(gdp_per_capita = 
           if_else(
              is.na(gdp_per_capita), 
              earliest_gdp_per_capita, 
              gdp_per_capita
              ),
    
         country_code =
           if_else(
              is.na(country_code.x),
              country_code.y, 
              country_code.x
              )
    ) %>% 
  
  # tidy up dataframe
  arrange(country,year) %>% 
  select(-c(country_code.x, country_code.y, earliest_gdp_per_capita))

Join the two datasets

#' Quickly check how well two dataframes join
#'
#' @param df1 the dataframe passed as the first argument to the 
#' @param df2 
#' @param join_func the specific join function to be applied (e.g. left_join)
#' @param ... other parameters to be passed to the join function
#' 
#' @return a vector including the number of observations which are of join successes and failures

join_performance <- function(df1, df2, join_func, ...){
  
  # perform the join
  join <- df1 %>%
    join_func(df2, ...)
  
  # the number of observations which are of join failures
  num_join_miss <- join %>% 
    filter(is.na(gdp_per_capita)) %>% 
    distinct(country_code) %>% 
    nrow()
  
  # the number of observations which are of join successes
  num_join_hit <- join %>% 
    filter(!is.na(gdp_per_capita)) %>% 
    distinct(country_code) %>% 
    nrow()

  # return the number of observations which are of join successes and failures
  c("join_sucesses" = num_join_hit, "join_failures" = num_join_miss)
}

#-------------------------------------------------------------------------------
# prepare the two data frames for joining
#-------------------------------------------------------------------------------
# gdp data only goes up to 2020, so remove 2021 and 2022 match data
results_clean <- filter(results_clean, year <= 2020) %>% 
  # remove any white space ahead of using to join with gdp data
  mutate(country_code = str_trim(country_code, side = "both"))

# remove any white space ahead of using to join with tennis data
gdp_clean <- gdp_clean %>% 
  mutate(country_code = str_trim(country_code, side = "both"))
  
#-------------------------------------------------------------------------------
# look at initial join performance
#-------------------------------------------------------------------------------
join_performance(results_clean, 
  gdp_clean,left_join)
## join_sucesses join_failures 
##            55            35
#-------------------------------------------------------------------------------
# web scrape a look up table for mapping between iso country codes (gdp data)
# and ioc codes (tennis data)
#-------------------------------------------------------------------------------

lookup_raw <- read_html("https://simple.wikipedia.org/wiki/Comparison_of_IOC,_FIFA,_and_ISO_3166_country_codes") %>% 
  html_nodes(xpath="/html/body/div[3]/div[3]/div[5]/div[1]/table[1]") %>% 
  html_table() %>% 
  pluck(1)

iso_ioc_lookup <- lookup_raw %>%
  janitor::clean_names() %>% 
  select(ioc, iso)

#-------------------------------------------------------------------------------
# replace iso country codes in gdp data with ioc codes (as used in tennis data)
#-------------------------------------------------------------------------------

gdp_clean_ioc <- gdp_clean %>% 
  left_join(iso_ioc_lookup,
            by = c("country_code" = "iso"))

# just one country without an ioc code so ignore
gdp_clean_ioc %>% 
  filter(is.na(ioc))
gdp_clean_ioc <- gdp_clean_ioc %>% 
  mutate(country_code = ioc) %>% 
  select(-ioc)

#-------------------------------------------------------------------------------
# check if join performance has improved
#-------------------------------------------------------------------------------

join_performance(results_clean,
                 gdp_clean_ioc,left_join)
## join_sucesses join_failures 
##            81             9
# look at players/countries where no match to gdp data

no_match <- results_clean %>% 
  left_join(gdp_clean_ioc) %>% 
  filter(is.na(gdp_per_capita)) 

no_match %>% 
  distinct(year, name, country_code)
no_match %>% 
  distinct(country_code)
unique(gdp_clean_ioc$country_code)
##   [1] "AFG" "ALB" "ALG" "ANG" "ANT" "ARG" "ARM" "ARU" "AUS" "AUT" "AZE" "BAH"
##  [13] "BRN" "BAN" "BAR" "BLR" "BEL" "BIZ" "BEN" "BER" "BHU" "BOL" "BIH" "BOT"
##  [25] "BRA" "BRU" "BUL" "BUR" "BDI" "CAM" "CMR" "CAN" "CPV" "CAY" "CAF" "CHA"
##  [37] "CHI" "CHN" "COL" "COM" "CGO" "CRC" "CIV" "CRO" ""    "CYP" "CZE" "COD"
##  [49] "DEN" "DJI" "DMA" "DOM" "ECU" "EGY" "ESA" "GEQ" "EST" "SWZ" "ETH" "FIJ"
##  [61] "FIN" "FRA" "GAB" "GAM" "GEO" "GER" "GHA" "GRE" "GRN" "GUA" "GUI" "GBS"
##  [73] "GUY" "HAI" "HON" "HKG" "HUN" "ISL" "IND" "INA" "IRI" "IRQ" "IRL" "ISR"
##  [85] "ITA" "JAM" "JPN" "JOR" "KAZ" "KEN" "KIR" NA    "KUW" "KGZ" "LAO" "LAT"
##  [97] "LIB" "LES" "LBR" "LBA" "LTU" "LUX" "MAD" "MAW" "MAS" "MDV" "MLI" "MLT"
## [109] "MHL" "MTN" "MRI" "MEX" "FSM" "MDA" "MGL" "MNE" "MAR" "MOZ" "MYA" "NAM"
## [121] "NRU" "NEP" "NED" "NZL" "NCA" "NIG" "NGR" "MKD" "NOR" "OMA" "PAK" "PLW"
## [133] "PLE" "PAN" "PNG" "PAR" "PER" "PHI" "POL" "POR" "PUR" "QAT" "ROU" "RUS"
## [145] "RWA" "SKN" "LCA" "VIN" "SAM" "SMR" "STP" "KSA" "SEN" "SRB" "SEY" "SLE"
## [157] "SGP" "SVK" "SLO" "SOL" "SOM" "RSA" "KOR" "ESP" "SRI" "SUD" "SUR" "SWE"
## [169] "SUI" "TJK" "TAN" "THA" "TLS" "TOG" "TGA" "TTO" "TUN" "TUR" "TKM" "TUV"
## [181] "UGA" "UKR" "UAE" "GBR" "USA" "URU" "UZB" "VAN" "VIE" "ZAM" "ZIM"
# -----------------------------------------------------------------------------
# Identify predecessor countries and check for the names of countries with 
# no gdp data
# -----------------------------------------------------------------------------
# set up string for searching within country names
country_check <- c("ven" = c("venezuela"),
                   "yug" = c("yugoslavia"),
                   "cub" = c("cuba"),
                   "mon" = c("monaco"),
                   "tpe" = c("taiwan"),
                   "tch" = c("czechoslovakia", "cz"),
                   "urs" = c("soviet union", "ussr", "russia"),
                   "scg" = c("serbia", "montenegro"),
                   "lie" = c("liechtenstein", "liech", "germany"))

# combine country search strings into a single regular expression
# | is OR
country_check_str <- str_c(country_check, collapse = "|")

# search strings found in the following countries present in the dataset 
matches_found <- gdp_clean_ioc %>% 
  mutate(country_check = str_detect(str_to_lower(country), country_check_str)) %>% 
  filter(country_check) %>% 
  distinct(country, country_code)
matches_found
# -----------------------------------------------------------------------------
# Where possible find a country that approximately matches for countries with 
# no gdp data
# -----------------------------------------------------------------------------
country_code_replacement_lookup <- matches_found %>% 
  mutate(code_to_replace = c("TCH", "LIE", NA, "URS", "SCG" )) %>% 
  rename(replacement_code = country_code) %>% 
  select(-country) %>% 
  filter(!is.na(code_to_replace)) 

results_clean <- results_clean %>% 
  left_join(country_code_replacement_lookup, 
            by = c("country_code" = "code_to_replace")) %>% 
  mutate(country_code = if_else(!is.na(replacement_code),
                                replacement_code, 
                                country_code)) %>% 
  select(-replacement_code)

#-------------------------------------------------------------------------------
# check if join performance has improved
#-------------------------------------------------------------------------------

join_performance(results_clean,
                 gdp_clean_ioc,left_join)
## join_sucesses join_failures 
##            81             5
# good enough
results_gdp <- left_join(results_clean, gdp_clean_ioc)

results_gdp %>% 
  relocate(gdp_per_capita, country) %>% 
  visdat::vis_miss()

# -----------------------------------------------------------------------------
# calculate number of first round grand slam entrants for each country each
# year
# -----------------------------------------------------------------------------
num_gs_entrants <- results_gdp %>% 
  group_by(year, country) %>% 
  mutate(num_first_rd = n()) %>% 
  ungroup() %>% 
  select(-c(id, name, tour, tourney_name)) %>% 
  distinct() %>% 
  arrange(year, desc(num_first_rd))

num_gs_entrants
# num of countries represented in first round of grand slam singles
num_countries <- num_gs_entrants %>% 
  count(year) %>% 
  rename(num_countries_represented = n)

# -----------------------------------------------------------------------------
# create exploratory plots
# -----------------------------------------------------------------------------

ggplot(num_countries, aes(year, num_countries_represented)) +
  geom_col()

p <- ggplot(num_gs_entrants, aes(year, num_first_rd, group = country)) +
  geom_line(alpha = 0.5)

plotly::ggplotly(p) # make interactive to explore which line is which country
# calculated country income deciles for each country
# use gdp data (all countries) rather than tennis data (subset of countries)
income_deciles <- gdp_clean_ioc %>% 
  group_by(year) %>% 
  mutate(income_decile = ntile(gdp_per_capita, 10)) %>% 
  ungroup() %>% 
  select(-c(country, gdp_per_capita))

# add the newly calculated deciles into the tennis/gdp dataframe
num_gs_entrants_inc_dec <- num_gs_entrants %>% 
  left_join(income_deciles) 

#' Plot the percentage of Grand Slam First Round appearances made by players 
#' from countries in the top n percentage of highest income
#'
#' @param df a dataframe containing the grand slam first round appearance data
#' @param decile integer between 1 and 10

plot_top_n_perc <- function(df, decile){

  # create a variable names and plot title based on the decile chosen
  top_n_perc <- (10 - decile + 1) * 10
  var_str <- str_c("top_", top_n_perc, "_perc")
  plot_title <- str_c("The top ", top_n_perc, "% highest income countries\n")

  # test if each observation is for specified decile or a higher decile
  df[, var_str] <- 
    df$income_decile >= decile
  
  # calculate number of first round appearance for top n % of countries
  df_plot <- df %>%
    
    na.omit() %>%
    
    group_by(year, !!sym(var_str)) %>%
    summarise(num_first_rd = sum(num_first_rd)) %>%
    ungroup() %>%
    
    group_by(year) %>%
    mutate(perc_first_rd = num_first_rd / sum(num_first_rd) * 100) %>% 
    ungroup() %>% 
    filter(!!sym(var_str))
  
  # create the plot itself
  df_plot %>%
    ggplot(aes(year, perc_first_rd)) +
      geom_line(colour = "#374E83") +
      
      # tidy up labelling of the plot
      labs(x = NULL, y = NULL,
           
           # put y axis label at the top of the y axis
           subtitle = "Percentage of first round\nGrand Slam appearances\n",
           
           # identify the top n % for the plot
           title = plot_title) +
      
      # tidy up axis
      scale_y_continuous(labels = scales::label_percent(scale = 1),
                         limits = c(0,100)) +
      scale_x_continuous(breaks = seq(1990, 2020, 5)) +
      coord_cartesian(expand = FALSE, clip = "off") +
      
      # tidy positioning of text and gridlines
      theme(panel.grid.minor.x = element_blank(),
            plot.title.position = "plot",
            plot.margin = margin(t = 20, r = 30, b = 20, l = 20, unit = "pt"))

}

#-------------------------------------------------------------------------------
# create exploratory plots for the top n percent of highest income countries
#-------------------------------------------------------------------------------

seq(10,1) %>% 
  map(~plot_top_n_perc(num_gs_entrants_inc_dec, .))
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

# -----------------------------------------------------------------------------
# calculate number of first round entrants for each grand slam for each country 
# each year
# -----------------------------------------------------------------------------
num_gs_tourney_entrants <- results_gdp %>% 
  
  group_by(year, tourney_name, country) %>% 
  mutate(num_first_rd = n()) %>% 
  ungroup() %>%
  
  select(-c(id, name, tour)) %>% 
  distinct() %>% 
  arrange(year, tourney_name, desc(num_first_rd)) %>% 
  
  # add income deciles
  left_join(income_deciles) %>% 
  
  # test if observation relates to a top 20% income country
  mutate(top_20_perc = income_decile >= 9)

num_gs_tourney_entrants
# -----------------------------------------------------------------------------
# create main data layer for the plot
# -----------------------------------------------------------------------------

# calculate number of first round appearance for top n % of countries
df_plot <- num_gs_tourney_entrants %>%
    
    na.omit() %>%
    
    group_by(year, tourney_name, top_20_perc) %>%
    summarise(num_first_rd = sum(num_first_rd)) %>%
    ungroup() %>%
    
    group_by(year, tourney_name) %>%
    mutate(perc_first_rd = num_first_rd / sum(num_first_rd) * 100) %>% 
    ungroup() %>% 
    filter(top_20_perc)

# -----------------------------------------------------------------------------
# create additional data layers for the plot
# -----------------------------------------------------------------------------
annotation_layer <- df_plot %>% 
  group_by(tourney_name) %>% 
  mutate(min_perc_first_rd = min(perc_first_rd),
         min_year = min(year),
         max_year = max(year)) %>% 
  ungroup() %>% 
  
  filter(min_perc_first_rd == perc_first_rd |
          year == min_year |
           year == max_year)

# -----------------------------------------------------------------------------
# create the plot itself
# -----------------------------------------------------------------------------
p <- df_plot %>%
    ggplot(aes(year, 
               perc_first_rd,
               colour = tourney_name)) +
      geom_line() +
      geom_point(data = annotation_layer) + 
      ggrepel::geom_text_repel(data = annotation_layer,
                                mapping = aes(label = round(perc_first_rd))) +
      
      # tidy up labelling of the plot
      labs(x = NULL, y = NULL,
           
           # put y axis label at the top of the y axis
           subtitle = "Percentage of first round\nGrand Slam appearances\n",
           
           # identify the top n % for the plot
           title = "plot_title") +
  
      scale_colour_manual(values = c("#6A8EAF", "#D97A53", 
                                     "#35469D", "#216E4B")) +
      
      # tidy up axis
      scale_y_continuous(labels = scales::label_percent(scale = 1),
                         limits = c(50,100)) +
      scale_x_continuous(breaks = seq(1990, 2020, 5)) +
      coord_cartesian(expand = FALSE, clip = "off") +
      
      # tidy positioning of text and gridlines
      theme(panel.grid.minor = element_blank(),
            panel.spacing.x = unit(2, "cm"),
            panel.spacing.y = unit(1, "cm"),
            plot.title.position = "plot",
            #plot.margin = margin(t = 20, r = 30, b = 20, l = 20, unit = "pt"),
            legend.position = "none",
            strip.text = element_text(margin = margin(10,5,10,5,"pt")),
            plot.margin = unit(c(2,2,2,2), 'cm')) +
  
      facet_wrap(~tourney_name)

p

ggsave("images/perc_top_20.svg", width = 297, height = 210, units = "mm")